import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.read_csv(r'C:\Users\ranjith valthaje\Project_uber/uber-raw-data-janjune-15.csv',encoding ='utf-8')
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 |
| ... | ... | ... | ... | ... |
| 14270474 | B02765 | 2015-05-08 15:43:00 | B02765 | 186 |
| 14270475 | B02765 | 2015-05-08 15:43:00 | B02765 | 263 |
| 14270476 | B02765 | 2015-05-08 15:43:00 | B02765 | 90 |
| 14270477 | B02765 | 2015-05-08 15:44:00 | B01899 | 45 |
| 14270478 | B02765 | 2015-05-08 15:44:00 | B02682 | 144 |
14270479 rows × 4 columns
uber_15 = pd.read_csv(r'C:\Users\ranjith valthaje\Project_uber/uber-raw-data-janjune-15.csv',encoding ='utf-8')
uber_15
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 |
| ... | ... | ... | ... | ... |
| 14270474 | B02765 | 2015-05-08 15:43:00 | B02765 | 186 |
| 14270475 | B02765 | 2015-05-08 15:43:00 | B02765 | 263 |
| 14270476 | B02765 | 2015-05-08 15:43:00 | B02765 | 90 |
| 14270477 | B02765 | 2015-05-08 15:44:00 | B01899 | 45 |
| 14270478 | B02765 | 2015-05-08 15:44:00 | B02682 | 144 |
14270479 rows × 4 columns
uber_15.shape
(14270479, 4)
uber_15.head(5)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 |
uber_15.tail(5)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 14270474 | B02765 | 2015-05-08 15:43:00 | B02765 | 186 |
| 14270475 | B02765 | 2015-05-08 15:43:00 | B02765 | 263 |
| 14270476 | B02765 | 2015-05-08 15:43:00 | B02765 | 90 |
| 14270477 | B02765 | 2015-05-08 15:44:00 | B01899 | 45 |
| 14270478 | B02765 | 2015-05-08 15:44:00 | B02682 | 144 |
uber_15.duplicated().sum()
898225
uber_15.drop_duplicates(inplace = True)
uber_15.shape
(13372254, 4)
uber_15.dtypes
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
pd.to_datetime(uber_15['Pickup_date'], format = '%Y-%m-%d %H:%M:%S')
0 2015-05-17 09:47:00
1 2015-05-17 09:47:00
2 2015-05-17 09:47:00
3 2015-05-17 09:47:00
4 2015-05-17 09:47:00
...
14270474 2015-05-08 15:43:00
14270475 2015-05-08 15:43:00
14270476 2015-05-08 15:43:00
14270477 2015-05-08 15:44:00
14270478 2015-05-08 15:44:00
Name: Pickup_date, Length: 13372254, dtype: datetime64[ns]
uber_15_Pickup_date = pd.to_datetime(uber_15['Pickup_date'], format = '%Y-%m-%d %H:%M:%S')
uber_15_Pickup_date.dtype
dtype('<M8[ns]')
uber_15_Pickup_date
0 2015-05-17 09:47:00
1 2015-05-17 09:47:00
2 2015-05-17 09:47:00
3 2015-05-17 09:47:00
4 2015-05-17 09:47:00
...
14270474 2015-05-08 15:43:00
14270475 2015-05-08 15:43:00
14270476 2015-05-08 15:43:00
14270477 2015-05-08 15:44:00
14270478 2015-05-08 15:44:00
Name: Pickup_date, Length: 13372254, dtype: datetime64[ns]
uber_15_Pickup_date.dt.month
0 5
1 5
2 5
3 5
4 5
..
14270474 5
14270475 5
14270476 5
14270477 5
14270478 5
Name: Pickup_date, Length: 13372254, dtype: int64
uber_15_month = uber_15_Pickup_date.dt.month
uber_15_month
0 5
1 5
2 5
3 5
4 5
..
14270474 5
14270475 5
14270476 5
14270477 5
14270478 5
Name: Pickup_date, Length: 13372254, dtype: int64
uber_15_month.value_counts()
6 2571771 5 2483980 2 2222189 4 2112705 3 2062639 1 1918970 Name: Pickup_date, dtype: int64
uber_15_month.value_counts().plot(kind = 'bar')
<AxesSubplot:>
uber_15_month.value_counts().plot(kind = 'bar', figsize = (10,5))
<AxesSubplot:>
uber_15['weekday'] = uber_15_Pickup_date.dt.day_name()
uber_15['day'] = uber_15_Pickup_date.dt.day
uber_15['hour'] = uber_15_Pickup_date.dt.hour
uber_15['month'] = uber_15_Pickup_date.dt.month
uber_15['minute'] = uber_15_Pickup_date.dt.minute
uber_15.head(5)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | weekday | day | hour | month | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 | Sunday | 17 | 9 | 5 | 47 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 | Sunday | 17 | 9 | 5 | 47 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 | Sunday | 17 | 9 | 5 | 47 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 | Sunday | 17 | 9 | 5 | 47 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 | Sunday | 17 | 9 | 5 | 47 |
uber_15.groupby(['month', 'weekday']).size()
month weekday
1 Friday 339285
Monday 190606
Saturday 386049
Sunday 230487
Thursday 330319
Tuesday 196574
Wednesday 245650
2 Friday 373550
Monday 274948
Saturday 368311
Sunday 296130
Thursday 335603
Tuesday 287260
Wednesday 286387
3 Friday 309631
Monday 269931
Saturday 314785
Sunday 313865
Thursday 277026
Tuesday 320634
Wednesday 256767
4 Friday 315002
Monday 238429
Saturday 324545
Sunday 273560
Thursday 372522
Tuesday 250632
Wednesday 338015
5 Friday 430134
Monday 255501
Saturday 464298
Sunday 390391
Thursday 337607
Tuesday 290004
Wednesday 316045
6 Friday 371225
Monday 375312
Saturday 399377
Sunday 334434
Thursday 357782
Tuesday 405500
Wednesday 328141
dtype: int64
type(uber_15.groupby(['month', 'weekday']).size())
pandas.core.series.Series
uber_15.groupby(['month', 'weekday'], as_index = False).size()
| month | weekday | size | |
|---|---|---|---|
| 0 | 1 | Friday | 339285 |
| 1 | 1 | Monday | 190606 |
| 2 | 1 | Saturday | 386049 |
| 3 | 1 | Sunday | 230487 |
| 4 | 1 | Thursday | 330319 |
| 5 | 1 | Tuesday | 196574 |
| 6 | 1 | Wednesday | 245650 |
| 7 | 2 | Friday | 373550 |
| 8 | 2 | Monday | 274948 |
| 9 | 2 | Saturday | 368311 |
| 10 | 2 | Sunday | 296130 |
| 11 | 2 | Thursday | 335603 |
| 12 | 2 | Tuesday | 287260 |
| 13 | 2 | Wednesday | 286387 |
| 14 | 3 | Friday | 309631 |
| 15 | 3 | Monday | 269931 |
| 16 | 3 | Saturday | 314785 |
| 17 | 3 | Sunday | 313865 |
| 18 | 3 | Thursday | 277026 |
| 19 | 3 | Tuesday | 320634 |
| 20 | 3 | Wednesday | 256767 |
| 21 | 4 | Friday | 315002 |
| 22 | 4 | Monday | 238429 |
| 23 | 4 | Saturday | 324545 |
| 24 | 4 | Sunday | 273560 |
| 25 | 4 | Thursday | 372522 |
| 26 | 4 | Tuesday | 250632 |
| 27 | 4 | Wednesday | 338015 |
| 28 | 5 | Friday | 430134 |
| 29 | 5 | Monday | 255501 |
| 30 | 5 | Saturday | 464298 |
| 31 | 5 | Sunday | 390391 |
| 32 | 5 | Thursday | 337607 |
| 33 | 5 | Tuesday | 290004 |
| 34 | 5 | Wednesday | 316045 |
| 35 | 6 | Friday | 371225 |
| 36 | 6 | Monday | 375312 |
| 37 | 6 | Saturday | 399377 |
| 38 | 6 | Sunday | 334434 |
| 39 | 6 | Thursday | 357782 |
| 40 | 6 | Tuesday | 405500 |
| 41 | 6 | Wednesday | 328141 |
temp = uber_15.groupby(['month', 'weekday'], as_index = False).size()
temp.head()
| month | weekday | size | |
|---|---|---|---|
| 0 | 1 | Friday | 339285 |
| 1 | 1 | Monday | 190606 |
| 2 | 1 | Saturday | 386049 |
| 3 | 1 | Sunday | 230487 |
| 4 | 1 | Thursday | 330319 |
temp['month'].unique()
array([1, 2, 3, 4, 5, 6], dtype=int64)
dict_month = {1:'Jan', 2:'Feb', 3:'March', 4:'April', 5:'May', 6:'June'}
temp['month'].map(dict_month)
0 Jan 1 Jan 2 Jan 3 Jan 4 Jan 5 Jan 6 Jan 7 Feb 8 Feb 9 Feb 10 Feb 11 Feb 12 Feb 13 Feb 14 March 15 March 16 March 17 March 18 March 19 March 20 March 21 April 22 April 23 April 24 April 25 April 26 April 27 April 28 May 29 May 30 May 31 May 32 May 33 May 34 May 35 June 36 June 37 June 38 June 39 June 40 June 41 June Name: month, dtype: object
temp_month = temp['month'].map(dict_month)
temp_month
0 Jan 1 Jan 2 Jan 3 Jan 4 Jan 5 Jan 6 Jan 7 Feb 8 Feb 9 Feb 10 Feb 11 Feb 12 Feb 13 Feb 14 March 15 March 16 March 17 March 18 March 19 March 20 March 21 April 22 April 23 April 24 April 25 April 26 April 27 April 28 May 29 May 30 May 31 May 32 May 33 May 34 May 35 June 36 June 37 June 38 June 39 June 40 June 41 June Name: month, dtype: object
temp
| month | weekday | size | |
|---|---|---|---|
| 0 | 1 | Friday | 339285 |
| 1 | 1 | Monday | 190606 |
| 2 | 1 | Saturday | 386049 |
| 3 | 1 | Sunday | 230487 |
| 4 | 1 | Thursday | 330319 |
| 5 | 1 | Tuesday | 196574 |
| 6 | 1 | Wednesday | 245650 |
| 7 | 2 | Friday | 373550 |
| 8 | 2 | Monday | 274948 |
| 9 | 2 | Saturday | 368311 |
| 10 | 2 | Sunday | 296130 |
| 11 | 2 | Thursday | 335603 |
| 12 | 2 | Tuesday | 287260 |
| 13 | 2 | Wednesday | 286387 |
| 14 | 3 | Friday | 309631 |
| 15 | 3 | Monday | 269931 |
| 16 | 3 | Saturday | 314785 |
| 17 | 3 | Sunday | 313865 |
| 18 | 3 | Thursday | 277026 |
| 19 | 3 | Tuesday | 320634 |
| 20 | 3 | Wednesday | 256767 |
| 21 | 4 | Friday | 315002 |
| 22 | 4 | Monday | 238429 |
| 23 | 4 | Saturday | 324545 |
| 24 | 4 | Sunday | 273560 |
| 25 | 4 | Thursday | 372522 |
| 26 | 4 | Tuesday | 250632 |
| 27 | 4 | Wednesday | 338015 |
| 28 | 5 | Friday | 430134 |
| 29 | 5 | Monday | 255501 |
| 30 | 5 | Saturday | 464298 |
| 31 | 5 | Sunday | 390391 |
| 32 | 5 | Thursday | 337607 |
| 33 | 5 | Tuesday | 290004 |
| 34 | 5 | Wednesday | 316045 |
| 35 | 6 | Friday | 371225 |
| 36 | 6 | Monday | 375312 |
| 37 | 6 | Saturday | 399377 |
| 38 | 6 | Sunday | 334434 |
| 39 | 6 | Thursday | 357782 |
| 40 | 6 | Tuesday | 405500 |
| 41 | 6 | Wednesday | 328141 |
plt.figure(figsize = (12,8))
sns.barplot(x ='month', y = 'size', hue = 'weekday', data = temp)
<AxesSubplot:xlabel='month', ylabel='size'>
summary = uber_15.groupby(['weekday','hour'], as_index = False).size()
summary
| weekday | hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 79879 |
| 1 | Friday | 1 | 44563 |
| 2 | Friday | 2 | 27252 |
| 3 | Friday | 3 | 19076 |
| 4 | Friday | 4 | 23049 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 131317 |
| 164 | Wednesday | 20 | 123490 |
| 165 | Wednesday | 21 | 120941 |
| 166 | Wednesday | 22 | 115208 |
| 167 | Wednesday | 23 | 91631 |
168 rows × 3 columns
plt.figure(figsize = (12,8))
sns.pointplot(x ='hour', y ='size', hue ='weekday', data = summary)
<AxesSubplot:xlabel='hour', ylabel='size'>
pd.read_csv(r'C:\Users\ranjith valthaje\Project_uber/Uber-Jan-Feb-FOIL.csv')
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
| ... | ... | ... | ... | ... |
| 349 | B02764 | 2/28/2015 | 3952 | 39812 |
| 350 | B02617 | 2/28/2015 | 1372 | 14022 |
| 351 | B02682 | 2/28/2015 | 1386 | 14472 |
| 352 | B02512 | 2/28/2015 | 230 | 1803 |
| 353 | B02765 | 2/28/2015 | 747 | 7753 |
354 rows × 4 columns
uber_foil = pd.read_csv(r'C:\Users\ranjith valthaje\Project_uber/Uber-Jan-Feb-FOIL.csv')
uber_foil.head()
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
!pip install chart_studio
!pip install plotly
Requirement already satisfied: chart_studio in c:\python\lib\site-packages (1.1.0) Requirement already satisfied: six in c:\python\lib\site-packages (from chart_studio) (1.16.0) Requirement already satisfied: retrying>=1.3.3 in c:\python\lib\site-packages (from chart_studio) (1.3.4) Requirement already satisfied: plotly in c:\python\lib\site-packages (from chart_studio) (5.9.0) Requirement already satisfied: requests in c:\python\lib\site-packages (from chart_studio) (2.28.1) Requirement already satisfied: tenacity>=6.2.0 in c:\python\lib\site-packages (from plotly->chart_studio) (8.0.1) Requirement already satisfied: charset-normalizer<3,>=2 in c:\python\lib\site-packages (from requests->chart_studio) (2.0.4) Requirement already satisfied: certifi>=2017.4.17 in c:\python\lib\site-packages (from requests->chart_studio) (2022.9.14) Requirement already satisfied: idna<4,>=2.5 in c:\python\lib\site-packages (from requests->chart_studio) (3.3) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\python\lib\site-packages (from requests->chart_studio) (1.26.11) Requirement already satisfied: plotly in c:\python\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\python\lib\site-packages (from plotly) (8.0.1)
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
px.box(x ='dispatching_base_number', y = 'active_vehicles', data_frame = uber_foil)
px.violin(x = 'dispatching_base_number', y = 'active_vehicles', data_frame = uber_foil)
import os
os.listdir(r'C:\Users\ranjith valthaje\Project_uber')
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
files = os.listdir(r'C:\Users\ranjith valthaje\Project_uber')[-7:]
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
files.remove('uber-raw-data-janjune-15.csv')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
path = r'C:\Users\ranjith valthaje\Project_uber'
final = pd.DataFrame()
for file in files:
current_df= pd.read_csv(path+'/'+file,encoding= 'utf-8')
final = pd.concat([current_df,final])
final.shape
(4534327, 4)
final.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
| 3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
| 4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
final.duplicated().sum()
82581
final.drop_duplicates(inplace = True)
final.shape
(4451746, 4)
final.groupby(['Lat', 'Lon']).size()
Lat Lon
39.6569 -74.2258 1
39.6686 -74.1607 1
39.7214 -74.2446 1
39.8416 -74.1512 1
39.9055 -74.0791 1
..
41.3730 -72.9237 1
41.3737 -73.7988 1
41.5016 -72.8987 1
41.5276 -72.7734 1
42.1166 -72.0666 1
Length: 574558, dtype: int64
final.groupby(['Lat', 'Lon'],as_index = False).size()
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
| ... | ... | ... | ... |
| 574553 | 41.3730 | -72.9237 | 1 |
| 574554 | 41.3737 | -73.7988 | 1 |
| 574555 | 41.5016 | -72.8987 | 1 |
| 574556 | 41.5276 | -72.7734 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
rush_uber = final.groupby(['Lat', 'Lon'],as_index = False).size()
rush_uber
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
| ... | ... | ... | ... |
| 574553 | 41.3730 | -72.9237 | 1 |
| 574554 | 41.3737 | -73.7988 | 1 |
| 574555 | 41.5016 | -72.8987 | 1 |
| 574556 | 41.5276 | -72.7734 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
#!pip install folium
import folium
basemap = folium.Map()
from folium.plugins import HeatMap
HeatMap(rush_uber).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x1556c9d2bb0>
basemap